From: Yves-Alexis Perez
Date: Fri, 2 Feb 2018 08:46:54 +0000 (+0100)
Subject: Revert "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes"
X-Git-Tag: archive/raspbian/4.9.80-2+rpi1~8^2~1
X-Git-Url: https://dgit.raspbian.org/%22http://www.example.com/cgi/%22/%22http:/www.example.com/cgi/%22?a=commitdiff_plain;h=45a8a21fe75fb28099e07b2ddda4b9e1e6c616e4;p=linux-4.9.git

Revert "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes"

This reverts commit 19a7db1e2ef38865a704ea4dfd178b02a8026ada which is
c73322d098e4b6f5f0f0fa1330bf57e218775539 upstream.

Adding a new field to struct pglist_data changes the ABI. Since the
problem doesn't seem to occur often, revert the change for now.

Gbp-Pq: Topic debian
Gbp-Pq: Name revert-mm-fix-100-CPU-kswapd-busyloop-on-unreclaimab.patch
---

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 65a686a7bf34..1192eb029c5b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -633,8 +633,6 @@ typedef struct pglist_data {
 	int kswapd_order;
 	enum zone_type kswapd_classzone_idx;
 
-	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
-
 #ifdef CONFIG_COMPACTION
 	int kcompactd_max_order;
 	enum zone_type kcompactd_classzone_idx;
diff --git a/mm/internal.h b/mm/internal.h
index 3e2d01694747..34a5459e5989 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -73,12 +73,6 @@ static inline void set_page_refcounted(struct page *page)
 
 extern unsigned long highest_memmap_pfn;
 
-/*
- * Maximum number of reclaim retries without progress before the OOM
- * killer is consider the only way forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
 /*
  * in mm/vmscan.c:
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94018ea5f935..546713b3f762 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3421,6 +3421,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return false;
 }
 
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is consider as the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
 /*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
@@ -4379,8 +4385,7 @@ void show_free_areas(unsigned int filter)
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
 			node_page_state(pgdat, NR_PAGES_SCANNED),
-			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
-				"yes" : "no");
+			!pgdat_reclaimable(pgdat) ? "yes" : "no");
 	}
 
 	for_each_populated_zone(zone) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f118dc23f662..30a88b945a44 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2606,15 +2606,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
 
-	/*
-	 * Kswapd gives up on balancing particular nodes after too
-	 * many failures to reclaim anything from them and goes to
-	 * sleep. On reclaim progress, reset the failure counter. A
-	 * successful direct reclaim run will revive a dormant kswapd.
-	 */
-	if (reclaimable)
-		pgdat->kswapd_failures = 0;
-
 	return reclaimable;
 }
 
@@ -2689,6 +2680,10 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 						 GFP_KERNEL | __GFP_HARDWALL))
 				continue;
 
+			if (sc->priority != DEF_PRIORITY &&
+			    !pgdat_reclaimable(zone->zone_pgdat))
+				continue;	/* Let kswapd poll it */
+
 			/*
 			 * If we already have plenty of memory free for
 			 * compaction in this zone, don't free any more.
@@ -2825,7 +2820,7 @@ retry:
 	return 0;
 }
 
-static bool allow_direct_reclaim(pg_data_t *pgdat)
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 {
 	struct zone *zone;
 	unsigned long pfmemalloc_reserve = 0;
@@ -2833,9 +2828,6 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
 	int i;
 	bool wmark_ok;
 
-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-		return true;
-
 	for (i = 0; i <= ZONE_NORMAL; i++) {
 		zone = &pgdat->node_zones[i];
 		if (!managed_zone(zone) ||
@@ -2916,7 +2908,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
 		/* Throttle based on the first usable node */
 		pgdat = zone->zone_pgdat;
-		if (allow_direct_reclaim(pgdat))
+		if (pfmemalloc_watermark_ok(pgdat))
 			goto out;
 		break;
 	}
@@ -2938,14 +2930,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 	 */
 	if (!(gfp_mask & __GFP_FS)) {
 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-			allow_direct_reclaim(pgdat), HZ);
+			pfmemalloc_watermark_ok(pgdat), HZ);
 
 		goto check_pending;
 	}
 
 	/* Throttle until kswapd wakes the process */
 	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-		allow_direct_reclaim(pgdat));
+		pfmemalloc_watermark_ok(pgdat));
 
 check_pending:
 	if (fatal_signal_pending(current))
@@ -3124,7 +3116,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 
 	/*
 	 * The throttled processes are normally woken up in balance_pgdat() as
-	 * soon as allow_direct_reclaim() is true. But there is a potential
+	 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
 	 * race between when kswapd checks the watermarks and a process gets
 	 * throttled. There is also a potential race if processes get
 	 * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3138,10 +3130,6 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	if (waitqueue_active(&pgdat->pfmemalloc_wait))
 		wake_up_all(&pgdat->pfmemalloc_wait);
 
-	/* Hopeless node, leave it to direct reclaim */
-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-		return true;
-
 	for (i = 0; i <= classzone_idx; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
@@ -3228,9 +3216,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
 
+		sc.nr_reclaimed = 0;
 		sc.reclaim_idx = classzone_idx;
 
 		/*
@@ -3309,7 +3297,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * able to safely make forward progress. Wake them
 		 */
 		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-				allow_direct_reclaim(pgdat))
+				pfmemalloc_watermark_ok(pgdat))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
 		/* Check if kswapd should be suspending */
@@ -3320,14 +3308,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
 		 */
-		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
-		if (raise_priority || !nr_reclaimed)
+		if (raise_priority || !sc.nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
 
-	if (!sc.nr_reclaimed)
-		pgdat->kswapd_failures++;
-
 out:
 	/*
 	 * Return the order kswapd stopped reclaiming at as
@@ -3527,10 +3511,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
 
-	/* Hopeless node, leave it to direct reclaim */
-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-		return;
-
 	/* Only wake kswapd if all zones are unbalanced */
 	for (z = 0; z <= classzone_idx; z++) {
 		zone = pgdat->node_zones + z;
@@ -3801,6 +3781,9 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 	    sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
 		return NODE_RECLAIM_FULL;
 
+	if (!pgdat_reclaimable(pgdat))
+		return NODE_RECLAIM_FULL;
+
 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3863b5d6d598..6a088df04b29 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1421,7 +1421,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n  node_unreclaimable:    %u"
 		   "\n  start_pfn:             %lu"
 		   "\n  node_inactive_ratio:   %u",
-		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
+		   !pgdat_reclaimable(zone->zone_pgdat),
 		   zone->zone_start_pfn,
 		   zone->zone_pgdat->inactive_ratio);
 	seq_putc(m, '\n');